/*******************************************************************************
																				
	DESCRIPTION:  	This do file produces binned scatterplots of empirical JFR 
					on predicted JFR for the underlying ML models.
	
*******************************************************************************/

clear all
global id_code 113_3

local vars _
local model Full
local year 2006

 * Load the data
use "${data}/003_MainWithEnsemblePred_`model'`vars'`year'.dta", clear

foreach outcome in emplAft6M_0M_In emplAft6M_6M_In emplAft6M_12M_In  {

	foreach model in "" _rf _boost _lasso {

		preserve 
		keep if `outcome' != .

		* Regress empirical JFR on predicted JFR and store coefficients and R-squared
		reg `outcome' p_`outcome'`model'
		local b0 : display %4.2f _b[_cons]
		local b1 : display %4.2f _b[p_`outcome'`model']
		local se0 : display %4.2f _se[_cons]
		local se1 : display %4.2f _se[p_`outcome'`model']
		local r2 : display %5.3f `e(r2)'
		local n : display %9.0fc `e(N)'
		
		* Now we generate bins with equal mass (i.e., quantiles):
		xtile bin = p_`outcome'`model', nq(20)
		
		* Calculated the average empirical JFR in each bin
		collapse (mean) `outcome' p_`outcome'`model', by(bin)

		* Predict the JFR based on the coefficients from the regression and bins
		predict prediction_line 
		
		* Generate helper for graph subtitle:
		if "`model'" == "" { 
			local sub "Ensemble"
		}
		if "`model'" == "_pos" { 
			local sub "Ensemble (positive weights)"
		}
		if "`model'" == "_nsp" { 
			local sub "Ensemble (no spline, 6% weights)"
		}
		if "`model'" == "_nspw" { 
			local sub "Ensemble (no spline, 11.4% weights)"
		}
		else if "`model'" == "_rf" {
			local sub "R. Forest"
		}
		else if "`model'" == "_boost" {
			local sub "B. Gradient"
		}
		else if "`model'" == "_lasso" {
			local sub "LASSO"
		}

		* Plot the average empirical JFR in each bin of predicted JFR
		twoway ///
			(line prediction_line p_`outcome'`model', lcolor(orange_red)) (function y=x, lcolor(gs7) lpattern(dash)) ///
			(scatter `outcome' p_`outcome'`model', mfcolor(ebblue) mlcolor(navy)), ///
			graphregion(color(white)) ///
			legend( ///
				order( ///
					/* 1 "{bf:Regression Line}" */ ///
					- "Intercept = `b0' (`se0')" ///
					- "Slope = `b1' (`se1')" ///
					- "N = `n'") ///
				symxsize(*0.2) size(vsmall) cols(1) pos(4) ring(0))		///
			subtitle("`sub'") xtitle("") ///
			ylabel(0(0.2)1, angle(0)) ///
			xscale(titlegap(2)) yscale(titlegap(2)) ///
			name(gph`model', replace)		
		
		restore
		
	}

	* Generate helper for axis label:
		if "`outcome'" == "emplAft6M_12M_In" { 
			local jfr "6-Month JFR 12 Months into Spell"
		}
		else if "`outcome'" == "emplAft6M_6M_In" {
			local jfr "6-Month JFR 6 Months into Spell"
		}
		else if "`outcome'" == "emplAft6M_0M_In" {
			local jfr "6-Month JFR at Start of Spell"
		}

	* Combine all four models into one graph:
	graph combine gph gph_rf gph_boost gph_lasso, ///
		l1("Average `jfr'") b1("Predicted `jfr'") ///
		ycommon xcommon ///
		graphregion(color(white)) name(scat_ML_`outcome', replace)

	graph export "${output}/${id_code}_Predicted_`outcome'_onEmpirical_ML_Robustness_`model'_`year'.pdf", as(pdf) replace 	
	
}